knitr::opts_chunk$set(
  echo = TRUE,
  warning = FALSE,
  message = FALSE)
knitr::opts_knit$set(root.dir = '~/Documents/NCBA/species/')

This demo requires the tidyverse packages.

library(tidyverse)

Load the NCBA functions because this function relies upon the output from some of them.

setwd("~/Code/NCBA/resources")
source("ncba_functions.R")
config <- "~/Documents/NCBA/Scripts/ncba_config.R"

Purpose

This document details a function that generates a boxplot of breeding codes with some customization options and supports the ability to view the checklist behind individual data points by clicking on them in the figure. The function offers several options as parameters.

species – common name of the species data – data frame of ebird or NCBA data type – whether to create an interactive plot that supports opening checklist URLs by clicking on data points in the figure, a non-interactive plot, or a non-interactive plot separated by ecoregion. pallet – specify an RColorBrewer pallet (multiple colors), or a single color (name or hex) for the figure. omit_codes – specify evidence codes not be plotted. lump – an option to lump breeding codes into fewer categories. drop – TRUE or FALSE whether to include unreported codes in the plot

Function Definition

breeding_boxplot
## function (species, data, type = "interactive", pallet = "Paired", 
##     omit_codes = NULL, lump = NULL, drop = TRUE, cex.x.axis = 0.9, 
##     cex.y.axis = 0.8, subtitle = NULL) 
## {
##     library(lubridate)
##     library(grid)
##     library(gridBase)
##     library(RColorBrewer)
##     library(ggiraph)
##     library(ggplot2)
##     ebird <- data
##     ebird["breeding_code"][ebird["breeding_code"] == ""] <- "NULL"
##     ebird$observation_date <- sub("^20\\d\\d", "2050", ebird$observation_date)
##     ebird$breeding_code <- trimws(ebird$breeding_code)
##     ebird$obsdate <- as.Date(ebird$observation_date, "%Y-%m-%d")
##     if (is.null(lump) == FALSE) {
##         drop <- TRUE
##     }
##     if (is.null(omit_codes) == FALSE) {
##         drop <- TRUE
##     }
##     codelevels <- c("H", "S", "S7", "M", "T", "P", "C", "B", 
##         "CN", "NB", "A", "N", "DD", "ON", "NE", "FS", "CF", "NY", 
##         "FY", "FL", "PE", "UN", "F", "O", "NC", "NULL")
##     if (is.null("lump") == FALSE) {
##         codelevels <- c(codelevels, names(lump)[!names(lump) %in% 
##             codelevels])
##     }
##     if (!all(ebird$breeding_code %in% codelevels)) {
##         warn <- paste("Not all eBird codes (breeding_code) for", 
##             species, "are in codelevels")
##         warning(warn)
##     }
##     if (drop == FALSE) {
##         missing_codes <- codelevels[!codelevels %in% ebird$breeding_code]
##         missing <- data.frame(matrix(ncol = ncol(ebird), nrow = length(missing_codes)))
##         names(missing) <- names(ebird)
##         missing$breeding_code <- missing_codes
##         ebird <- rbind(ebird, missing)
##         ebird <- ebird %>% mutate(breeding_code = factor(ebird$breeding_code, 
##             levels = codelevels, ordered = TRUE))
##     }
##     if (drop == TRUE) {
##         if (is.null("omit_codes") == FALSE) {
##             ebird <- ebird[!ebird$breeding_code %in% omit_codes, 
##                 ]
##         }
##         for (i in seq_along(lump)) {
##             indx <- ebird$breeding_code %in% lump[[i]]
##             ebird[indx, "breeding_code"] <- names(lump)[i]
##         }
##         codelevels <- codelevels[codelevels %in% ebird$breeding_code]
##         if (is.null("omit_codes") == FALSE) {
##             codelevels <- codelevels[!codelevels %in% omit_codes]
##         }
##         ebird <- ebird %>% mutate(breeding_code = factor(ebird$breeding_code, 
##             levels = codelevels, ordered = TRUE))
##     }
##     if (pallet %in% rownames(brewer.pal.info)) {
##         n <- brewer.pal.info[pallet, "maxcolors"]
##         codecolors <- colorRampPalette(brewer.pal(n, pallet))(length(codelevels))
##     }
##     else {
##         codecolors <- rep(pallet, length(codelevels))
##     }
##     names(codecolors) <- codelevels
##     ebird$col <- codecolors[ebird$breeding_code]
##     if (type == "non-interactive") {
##         boxplot(obsdate ~ breeding_code, horizontal = TRUE, cex.axis = cex.y.axis, 
##             xaxt = "n", data = ebird, border = "white", main = species, 
##             las = 2, xlab = "Calendar Day", ylab = "Breeding Code", 
##             show.names = TRUE, na.action = na.pass)
##         have_dates <- subset(ebird, is.na(observation_date) == 
##             FALSE)
##         date0 <- round_date(min(have_dates$obsdate), "month")
##         date1 <- round_date(max(have_dates$obsdate), "month")
##         labels <- seq(from = date0, to = date1, by = "month")
##         if (length(unique(month(have_dates$obsdate))) == 1) {
##             labels <- c(min(have_dates$obsdate), max(have_dates$obsdate))
##             labels <- unique(labels)
##         }
##         else {
##             if (nrow(have_dates) > 1 && length(labels) == 1) {
##                 labels <- unique(c(min(have_dates$obsdate), max(have_dates$obsdate)))
##             }
##         }
##         names(labels) <- format(labels, "%b %d")
##         vps <- baseViewports()
##         pushViewport(vps$inner, vps$figure, vps$plot)
##         grid.text(names(labels), x = unit(labels, "native"), 
##             y = unit(-0.7, "lines"), just = "right", rot = 65, 
##             gp = gpar(cex = cex.x.axis))
##         popViewport(3)
##         axis(1, labels, labels = FALSE)
##         col <- unique(ebird$col)
##         stripchart(obsdate ~ breeding_code, data = ebird, vertical = FALSE, 
##             method = "jitter", pch = 16, col = col, add = TRUE, 
##             na.action = na.pass)
##         boxplot(obsdate ~ breeding_code, horizontal = TRUE, col = "#F5F5F500", 
##             yaxt = "n", xaxt = "n", data = ebird, add = TRUE, 
##             na.action = na.pass)
##     }
##     if (type == "ecoregional") {
##         fields <- c("ID_BLOCK", "ID_EBD_NAME", "ECOREGION", "COUNTY", 
##             "ID_WEB_BLOCKMAP")
##         blocks <- get_blocks(ncba_config = config, spatial = FALSE, 
##             fields = fields, crs = 4326)
##         records2 <- left_join(ebird, blocks, by = c(ncba_block = "ID_EBD_NAME")) %>% 
##             filter(is.na(ECOREGION) == FALSE)
##         records2$ECOREGION[records2$ECOREGION == "CP"] <- "Coastal Plain"
##         records2$ECOREGION[records2$ECOREGION == "P"] <- "Piedmont"
##         records2$ECOREGION[records2$ECOREGION == "M"] <- "Mountains"
##         records2$ECOREGION <- factor(records2$ECOREGION, levels = c("Coastal Plain", 
##             "Piedmont", "Mountains"))
##         result <- ggplot(data = records2) + geom_boxplot(aes(x = obsdate, 
##             y = breeding_code)) + facet_wrap(~ECOREGION, nrow = 3) + 
##             labs(y = "Breeding Code", x = "Calendar Day")
##         plot(result)
##     }
##     if (type == "interactive") {
##         ebird$front <- "https://ebird.org/checklist/"
##         ebird$ChecklistLink <- with(ebird, paste0(front, sampling_event_identifier))
##         gg_point = ggplot(data = ebird) + labs(y = "Breeding Code", 
##             x = "Calendar Day") + geom_boxplot(aes(x = obsdate, 
##             y = breeding_code)) + geom_point_interactive(aes(x = obsdate, 
##             y = breeding_code, color = col, tooltip = obsdate, 
##             data_id = obsdate, onclick = paste0("window.open(\"", 
##                 ChecklistLink, "\", \"_blank\")")), show.legend = FALSE, 
##             position = position_jitter(width = 0.2, height = 0.2)) + 
##             theme_minimal() + labs(title = species)
##         girafe(ggobj = gg_point, width_svg = 10, options = list(opts_sizing(rescale = TRUE)))
##     }
## }

Usage

Identify a species to investigate.

species <- "Loggerhead Shrike"
print(species)
## [1] "Loggerhead Shrike"

Retrieve the records for the species from the Atlas Cache

time1 <- proc.time()

nc_data <- get_observations(species, NCBA_only = TRUE, EBD_fields_only = FALSE,
                            ncba_config = config)

# format columns to the standard analysis format (ebd format)
records <- to_EBD_format(nc_data, drop=FALSE)

# Calculate processing time
mongotime <- proc.time() - time1

# Print number of records returned
print(paste("Records returned:", nrow(records)))
## [1] "Records returned: 1304"
print(paste("Runtime: ", mongotime[["elapsed"]]))
## [1] "Runtime:  6.239"

Make a non-interactive boxplot without lumping codes together

breeding_boxplot(species, records, type = "non-interactive")

Make an interactive boxplot without lumping codes together

breeding_boxplot(species, records)

Create an interactive boxplot with some of the codes lumped together.

lump <- list(S = c("S", "S7", "M"), O = c("NULL", "F", "O", "NC"))

breeding_boxplot(species, records, lump=lump)

Create an interactive boxplot that omits some codes and uses a different colormap.

omit <- c("S7", "M", "NC", "H", "T", "P", "C", "CN", "N", "A", "NB", 
          "FS")
breeding_boxplot(species, records, type = "interactive", pallet="Set3",
                     omit_codes=omit, lump=NULL, drop=TRUE)

Make a figure that has a boxplot for each ecoregion.

breeding_boxplot(species = species, data = records, type = "ecoregional",
                     lump = breeding_codes(lumped = TRUE))

Tests

Fully programmed tests are not feasible because the function returns a figure, but we can summarize the aspects of the dataset behind the figure and visually compare them to what the figure shows. From the cells above, the input data frame is named “records”.

  1. Lump all codes into two categories as an extreme example to make it clear that they work. Use category names that aren’t breeding codes in order to show they can be handled.
lump <- list(X = c("S", "S7", "M", "H", "NULL", "P", "F", "UN", "PE", "NY"), 
             Z = c("A", "N", "CN", "T", "NB","FS", "FL", "CF", "FY", "C",
                   "ON", "NE", "B"))

breeding_boxplot(species, records, type = "non-interactive", lump=lump)

breeding_boxplot(species, records, type = "interactive", lump=lump)
breeding_boxplot(species, records, type = "ecoregional", lump=lump)

A standard lumping list is available for the primary categories.

lumped_codes <- breeding_codes()
breeding_boxplot(species, records, type = "interactive", lump=lumped_codes)
  1. Omit all but three codes as an extreme example to make sure things work.
omit <- c("S", "M", "H", "NULL", "P", "A", "N", "CN", "T", "NB","FS", "CF", "C")

breeding_boxplot(species, records, type = "non-interactive", omit_codes = omit)

breeding_boxplot(species, records, type = "interactive", omit_codes = omit)
breeding_boxplot(species, records, type = "ecoregional", omit_codes = omit)

  1. Check that the pallet argument works by assigning a non-default value.
breeding_boxplot(species, records, pallet="Spectral")
breeding_boxplot(species, records, pallet="Paired")
breeding_boxplot(species, records, type="non-interactive", pallet="Spectral")

breeding_boxplot(species, records, type="non-interactive", pallet="Paired")

  1. Check that all breeding codes that are present in the figure when drop is set to “FALSE”.
breeding_boxplot(species, records, type="non-interactive", drop=FALSE)

breeding_boxplot(species, records, drop=FALSE)
breeding_boxplot(species, records, type="ecoregional", drop=FALSE)

  1. Check that all breeding codes that are present in records are included in the y-axis, when drop is set to “TRUE”. Note that “” is replaced with “NULL” by the function.
# Print the unique breeding codes in records and the figure
print(unique(records$breeding_code))
##  [1] ""   "P"  "S"  "ON" "H"  "CF" "FY" "CN" "FL" "N"  "PE" "S7" "A"  "NB" "F" 
## [16] "NY" "T"
breeding_boxplot(species, records, type="interactive", drop=TRUE)
breeding_boxplot(species, records, type="non-interactive", drop=TRUE)

breeding_boxplot(species, records, type="ecoregional", drop=TRUE)

Speed Test

Run the function several times to get descriptive statistics.

# Run the function 10 times and record the runtime
time <- c()
for (i in 1:10) {
  time1 <- proc.time()
  breeding_boxplot(species, records, type = "non-interactive")
  t <- proc.time() - time1
  time[i] <- t["elapsed"]
}

# Print the descriptive statistics
print(summary(time))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0200  0.0320  0.0320  0.0316  0.0330  0.0360

Results of the speed test were…

Min. 1st Qu. Median Mean 3rd Qu. Max. 0.0220 0.0240 0.0250 0.0261 0.0260 0.0470